Disaster Relief Project

K-Folds Out of Sampling Performance ( + RF/SVM )

Method KNN(k=5) LDA QDA LR RF(tuning parameter: mtry=3; ntrees=500) SVM(tuning parameter: cost=10; gamma=1; sigma = 8.691262; C = 1)
Accuracy 99.6% 98.3% 99.4% 99.5% 99.7% 99.7%
AUC 99.8% 98.7% 99.8% 99.8% 99.0% 99.4%
ROC in tabs in tabs in tabs in tabs in tabs in tabs
Threshold .66 .85 .40 .29 .50 .52
Sensitivity 93.1% 74.0% 85.1% 91.0% 95.2% 94.0%
Specificity 99.9% 99.2% 99.6% 99.8% 99.8% 99.9%
FDR 2.2% 24.6% 1.7% 4.7% 4.8% 2.8%
Precision 97.8% 75.4% 98.3% 95.3% 95.2% 97.2%

K-Folds Process

Setup
Values will differ from last submission due to new random sampling of the data for train/test sets.
library(tidyverse)
library(dplyr)
library(caret)
library(class)
library(yardstick)
library(plotly)
library(boot)
library(pROC)
library(glmnet)
library(purrr)
library(gridExtra)
library(randomForest)
library(e1071)
# kernlab is the backend caret uses for method = "svmRadial". Install it only
# when it is missing so the package is not re-downloaded on every run.
if (!requireNamespace("kernlab", quietly = TRUE)) {
  install.packages("kernlab")
}
# Load the pixel data (comma-separated file with a header row).
data <- read.csv("HaitiPixels.csv", header = TRUE, sep = ",")

# Derive the binary response: "Yes" for rows whose Class is "Blue Tarp",
# "No" for every other class.
data <- mutate(
  data,
  BlueClass = as.factor(ifelse(Class == "Blue Tarp", "Yes", "No"))
)
# Confirm the factor levels that were just created.
levels(data$BlueClass)
## [1] "No"  "Yes"
# Keep only columns 2 through 5 of the data frame.
data <- data[2:5]
# Tag every row with its position so the shuffle can be verified afterwards.
data <- mutate(data, id = row_number())
# Inspect the first few ids.
head(data$id)
## [1] 1 2 3 4 5 6
# Shuffle the rows so the train/test split is a random sample of the data.
# The RNG seed is fixed first so the shuffle, and every number derived from
# it, is reproducible from one run to the next.
set.seed(1)
shuffleddata <- sample_n(data, nrow(data))
# Check that it has been shuffled: the ids should no longer read 1, 2, 3, ...
head(shuffleddata$id)
## [1] 52330 20209 39644 38201 16443 12272
# Remove the id column by keeping only the first four columns.
shuffleddata <- shuffleddata[, 1:4]
# Split the shuffled data into train and test sets for the upcoming models.
# Each set is a 10k-row subset, chosen for faster knn execution and to avoid
# file freezing issues.
samp <- seq_len(10000)
samp2 <- seq(10001, 20000)
train <- shuffleddata[samp, ]
test <- shuffleddata[samp2, ]
head(train)
head(test)
KNN
# Resampling rules shared by the models: 10-fold cross-validation, keeping
# all resample results, the class probabilities, and the held-out
# predictions of the final model.
train_control <- caret::trainControl(
  method = "cv",
  number = 10,
  returnResamp = "all",
  classProbs = TRUE,
  savePredictions = "final"
)

# KNN model: centered and scaled predictors, k tuned over 1 to 15.
system.time({
  knnmod <- train(
    BlueClass ~ Red + Green + Blue,
    data = train,
    method = "knn",
    preProcess = c("center", "scale"),
    tuneGrid = expand.grid(k = 1:15),
    trControl = train_control
  )
})
##    user  system elapsed 
##  20.873   0.105  21.140
knnmod
## k-Nearest Neighbors 
## 
## 10000 samples
##     3 predictor
##     2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9001, 9000, 8999, 9000, 9001, 8999, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.9966007  0.9477581
##    2  0.9961002  0.9402912
##    3  0.9960004  0.9383623
##    4  0.9963002  0.9430485
##    5  0.9963006  0.9433100
##    6  0.9962001  0.9413603
##    7  0.9962001  0.9406740
##    8  0.9962998  0.9425493
##    9  0.9958995  0.9357883
##   10  0.9957995  0.9343295
##   11  0.9960996  0.9392304
##   12  0.9958998  0.9360504
##   13  0.9957000  0.9328276
##   14  0.9957998  0.9345563
##   15  0.9954998  0.9297910
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
#plot KNN model
plot(knnmod)

# Score the test set: predicted class labels and class probabilities.
knnmod_pred <- predict(knnmod, test, type = "raw")
knnmod_prob <- predict(knnmod, test, type = "prob")

knnmod_scored <- cbind(test, knnmod_pred, knnmod_prob)
#AUC/ROC
# Deprecated since yardstick 0.0.7; left in place so that any yardstick call
# that does not pass `event_level` still treats "Yes" as the event.
options(yardstick.event_first = FALSE)

# Area under the ROC curve on the test set. "Yes" is the second level of
# BlueClass, so the event level is passed explicitly, and the truth column is
# taken from the scored data frame instead of an outside vector.
knn_auc <- knnmod_scored %>%
  yardstick::roc_auc(truth = BlueClass, Yes, event_level = "second")
## Warning: The `yardstick.event_first` option has been deprecated as of yardstick 0.0.7 and will be completely ignored in a future version.
## Instead, set the following argument directly in the metric function:
## `options(yardstick.event_first = TRUE)`  -> `event_level = 'first'` (the default)
## `options(yardstick.event_first = FALSE)` -> `event_level = 'second'`
## This warning is displayed once per session.
knn_auc
#ROC curve + plot
# Build the curve from the scored test set. "Yes" is the second level of
# BlueClass, so it is named as the event explicitly rather than through the
# deprecated yardstick.event_first option.
ROC_curve <- knnmod_scored %>%
  yardstick::roc_curve(truth = BlueClass, Yes, event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)

ROC_curve_plot <- ROC_curve %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue") +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle("KNN ROC curve")
ggplotly(ROC_curve_plot)
#set threshold
# Re-label the saved cross-validation predictions with a 0.66 cutoff on the
# "Yes" probability.
knnmod_pred2 <- knnmod$pred %>%
  #the accuracy doesn't improve by reducing the threshold any further than .66, 99.7% best.
  mutate(prediction = ifelse(Yes > .66, "Yes", "No")) %>%
  mutate(prediction = factor(prediction, levels = c("No", "Yes")))

#confusion matrix
confusionMatrix(knnmod_pred2$prediction, knnmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  9648   16
##        Yes   18  318
##                                           
##                Accuracy : 0.9966          
##                  95% CI : (0.9953, 0.9976)
##     No Information Rate : 0.9666          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9475          
##                                           
##  Mcnemar's Test P-Value : 0.8638          
##                                           
##             Sensitivity : 0.9521          
##             Specificity : 0.9981          
##          Pos Pred Value : 0.9464          
##          Neg Pred Value : 0.9983          
##              Prevalence : 0.0334          
##          Detection Rate : 0.0318          
##    Detection Prevalence : 0.0336          
##       Balanced Accuracy : 0.9751          
##                                           
##        'Positive' Class : Yes             
## 
LDA
#LDA model
# No `family` argument here: it is a glm() option and has no meaning for an
# lda fit.
system.time({
  ldamod <- train(
    BlueClass ~ Red + Green + Blue,
    data = train,
    trControl = train_control,
    method = "lda",
    preProcess = c("center", "scale")
  )
})
##    user  system elapsed 
##   1.084   0.070   1.366
ldamod
## Linear Discriminant Analysis 
## 
## 10000 samples
##     3 predictor
##     2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9000, 9000, 9001, 9000, 9000, 9001, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9846001  0.7717723
#set prediction, probability, and cv score variables in case needed
ldamod_pred <- predict(ldamod, test, type = "raw")
ldamod_prob <- predict(ldamod, test, type = "prob")

ldamod_scored <- cbind(test, ldamod_pred, ldamod_prob)
#AUC/ROC
# "Yes" is the second level of BlueClass, so each yardstick call names it as
# the event with event_level = "second" (the supported replacement for the
# deprecated yardstick.event_first option). The truth column is taken from
# the scored data frame instead of an outside vector.

#area under the curve
lda_auc <- ldamod_scored %>%
  yardstick::roc_auc(truth = BlueClass, Yes, event_level = "second")
lda_auc
#ROC curve + plot
ROC_curve2 <- ldamod_scored %>%
  yardstick::roc_curve(truth = BlueClass, Yes, event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)

ROC_curve_plot2 <- ROC_curve2 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue") +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle("LDA ROC curve")
ggplotly(ROC_curve_plot2)
#set new threshold
# Re-label the saved cross-validation predictions with a 0.85 cutoff on the
# "Yes" probability.
ldamod_pred2 <- ldamod$pred %>%
  #the accuracy doesn't improve by reducing the threshold any further than .85, 98.6% best.
  mutate(prediction = ifelse(Yes > .85, "Yes", "No")) %>%
  mutate(prediction = factor(prediction, levels = c("No", "Yes")))

#new threshold matrix
confusionMatrix(ldamod_pred2$prediction, ldamod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  9597   81
##        Yes   69  253
##                                           
##                Accuracy : 0.985           
##                  95% CI : (0.9824, 0.9873)
##     No Information Rate : 0.9666          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7636          
##                                           
##  Mcnemar's Test P-Value : 0.3691          
##                                           
##             Sensitivity : 0.7575          
##             Specificity : 0.9929          
##          Pos Pred Value : 0.7857          
##          Neg Pred Value : 0.9916          
##              Prevalence : 0.0334          
##          Detection Rate : 0.0253          
##    Detection Prevalence : 0.0322          
##       Balanced Accuracy : 0.8752          
##                                           
##        'Positive' Class : Yes             
## 
QDA
#QDA model
# No `family` argument here: it is a glm() option and has no meaning for a
# qda fit.
system.time({
  qdamod <- train(
    BlueClass ~ Red + Green + Blue,
    data = train,
    trControl = train_control,
    method = "qda",
    preProcess = c("center", "scale")
  )
})
##    user  system elapsed 
##   1.064   0.002   1.083
qdamod
## Quadratic Discriminant Analysis 
## 
## 10000 samples
##     3 predictor
##     2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9000, 9000, 9000, 9000, 8999, 9001, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9945991  0.9091862
#set prediction, probability, and cv score variables in case needed
qdamod_pred <- predict(qdamod, test, type = "raw")
qdamod_prob <- predict(qdamod, test, type = "prob")

qdamod_scored <- cbind(test, qdamod_pred, qdamod_prob)
#AUC/ROC
# "Yes" is the second level of BlueClass, so each yardstick call names it as
# the event with event_level = "second" (the supported replacement for the
# deprecated yardstick.event_first option). The truth column is taken from
# the scored data frame instead of an outside vector.

#area under the curve
qda_auc <- qdamod_scored %>%
  yardstick::roc_auc(truth = BlueClass, Yes, event_level = "second")
qda_auc
#ROC curve + plot
ROC_curve3 <- qdamod_scored %>%
  yardstick::roc_curve(truth = BlueClass, Yes, event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)

# The title names QDA; it previously carried the LDA label by copy-paste.
ROC_curve_plot3 <- ROC_curve3 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue") +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle("QDA ROC curve")
ggplotly(ROC_curve_plot3)
#set new threshold
# Re-label the saved cross-validation predictions with a 0.40 cutoff on the
# "Yes" probability.
qdamod_pred2 <- qdamod$pred %>%
  #the accuracy doesn't improve by reducing the threshold any further than .40, 99.5% best.
  mutate(prediction = ifelse(Yes > .40, "Yes", "No")) %>%
  mutate(prediction = factor(prediction, levels = c("No", "Yes")))

#new threshold matrix
confusionMatrix(qdamod_pred2$prediction, qdamod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  9661   47
##        Yes    5  287
##                                           
##                Accuracy : 0.9948          
##                  95% CI : (0.9932, 0.9961)
##     No Information Rate : 0.9666          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9143          
##                                           
##  Mcnemar's Test P-Value : 1.303e-08       
##                                           
##             Sensitivity : 0.8593          
##             Specificity : 0.9995          
##          Pos Pred Value : 0.9829          
##          Neg Pred Value : 0.9952          
##              Prevalence : 0.0334          
##          Detection Rate : 0.0287          
##    Detection Prevalence : 0.0292          
##       Balanced Accuracy : 0.9294          
##                                           
##        'Positive' Class : Yes             
## 
LR
# Logistic regression: a binomial-family GLM on centered and scaled
# predictors, resampled with the shared train_control rules.
system.time({
  glmmod <- train(
    BlueClass ~ Red + Green + Blue,
    data = train,
    method = "glm",
    family = "binomial",
    preProcess = c("center", "scale"),
    trControl = train_control
  )
})
##    user  system elapsed 
##   1.848   0.014   1.901
glmmod
## Generalized Linear Model 
## 
## 10000 samples
##     3 predictor
##     2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9001, 8999, 9000, 9000, 9000, 8999, ... 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.9952998  0.924728
#set prediction, probability, and cv score variables in case needed
glmmod_pred <- predict(glmmod, test, type = "raw")
glmmod_prob <- predict(glmmod, test, type = "prob")

glmmod_scored <- cbind(test, glmmod_pred, glmmod_prob)
#AUC/ROC
# "Yes" is the second level of BlueClass, so each yardstick call names it as
# the event with event_level = "second" (the supported replacement for the
# deprecated yardstick.event_first option). The truth column is taken from
# the scored data frame instead of an outside vector.

#area under the curve
# Stored under its own name so the QDA result in qda_auc is not overwritten.
glm_auc <- glmmod_scored %>%
  yardstick::roc_auc(truth = BlueClass, Yes, event_level = "second")
glm_auc
#ROC curve + plot
ROC_curve4 <- glmmod_scored %>%
  yardstick::roc_curve(truth = BlueClass, Yes, event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)

# The title names the logistic regression; it previously carried the LDA
# label by copy-paste.
ROC_curve_plot4 <- ROC_curve4 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue") +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle("LR ROC curve")
ggplotly(ROC_curve_plot4)
#set new threshold
# Re-label the saved cross-validation predictions with a 0.29 cutoff on the
# "Yes" probability.
glmmod_pred2 <- glmmod$pred %>%
  #the accuracy doesn't improve by reducing the threshold any further than .29, 99.6% best.
  mutate(prediction = ifelse(Yes > .29, "Yes", "No")) %>%
  mutate(prediction = factor(prediction, levels = c("No", "Yes")))

#new threshold matrix
confusionMatrix(glmmod_pred2$prediction, glmmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  9652   25
##        Yes   14  309
##                                           
##                Accuracy : 0.9961          
##                  95% CI : (0.9947, 0.9972)
##     No Information Rate : 0.9666          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9386          
##                                           
##  Mcnemar's Test P-Value : 0.1093          
##                                           
##             Sensitivity : 0.9251          
##             Specificity : 0.9986          
##          Pos Pred Value : 0.9567          
##          Neg Pred Value : 0.9974          
##              Prevalence : 0.0334          
##          Detection Rate : 0.0309          
##    Detection Prevalence : 0.0323          
##       Balanced Accuracy : 0.9619          
##                                           
##        'Positive' Class : Yes             
## 
RF
#Random Forest model

#Choosing tuning parameters:
#https://discuss.analyticsvidhya.com/t/how-to-decide-no-of-ntrees-in-randomforest/6882/3
#https://rpubs.com/phamdinhkhanh/389752

# Resampling rules for training: 10-fold cross-validation, keeping all
# resample results, class probabilities and the final model's held-out
# predictions.
train_control <- caret::trainControl(
  method = "cv",
  number = 10,
  returnResamp = "all",
  classProbs = TRUE,
  savePredictions = "final"
)

#https://stackoverflow.com/questions/10085806/extracting-specific-columns-from-a-data-frame
# Predictor columns only; select() is namespaced so a masking select() from
# another attached package cannot be picked up by accident.
df <- train %>%
  dplyr::select(Red, Green, Blue)

# Search mtry by out-of-bag error. tuneRF()'s arguments are spelled out in
# full (mtryStart / ntreeTry) instead of relying on partial matching of
# `mtry` / `ntree`, and the search starts at the number of predictors, the
# largest valid mtry. A start value above that triggers randomForest's
# "invalid mtry: reset to within valid range" warning.
(tuneRF(df, train$BlueClass,
        mtryStart = ncol(df), ntreeTry = 500, stepFactor = 5, improve = 0.05,
        trace = TRUE, plot = TRUE, doBest = TRUE))
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## invalid mtry: reset to within valid range
## mtry = 5  OOB error = 0.38% 
## Searching left ...
## mtry = 1     OOB error = 0.32% 
## 0.1578947 0.05 
## Searching right ...
## mtry = 3     OOB error = 0.38% 
## -0.1875 0.05

## 
## Call:
##  randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1]) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 0.35%
## Confusion matrix:
##       No Yes class.error
## No  9656  10 0.001034554
## Yes   25 309 0.074850299
#mtry = 3

##------

# mtry is held at 3 for the caret fits below.
tunegrid <- expand.grid(.mtry = 3)
modellist <- list()
# Earlier trial fits with smaller forests, kept for reference. They were used
# to compare ntree values and inspect the bias/variance tradeoff.
# findtrees1 <- train(BlueClass ~ Red + Green + Blue,
#                     data = train,
#                     method = 'rf',
#                     metric = 'Accuracy',
#                     tuneGrid = tunegrid,
#                     trControl = control,
#                     ntree = 50)
# findtrees1

# findtrees2 <- train(BlueClass ~ Red + Green + Blue,
#                     data = train,
#                     method = 'rf',
#                     metric = 'Accuracy',
#                     tuneGrid = tunegrid,
#                     trControl = control,
#                     ntree = 100)
# findtrees2

# Final random forest: 500 trees, mtry fixed by tunegrid, resampled with
# train_control.
system.time({
  RF <- train(
    BlueClass ~ Red + Green + Blue,
    data = train,
    method = "rf",
    ntree = 500,
    metric = "Accuracy",
    trControl = train_control,
    tuneGrid = tunegrid
  )
})
##    user  system elapsed 
##  17.444   1.381  30.799
RF
## Random Forest 
## 
## 10000 samples
##     3 predictor
##     2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9000, 9000, 9000, 8999, 9000, 9000, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9959004  0.9354476
## 
## Tuning parameter 'mtry' was held constant at a value of 3
#set prediction, probability, and cv score variables in case needed
rfmod_pred <- predict(RF, test, type = "raw")
rfmod_prob <- predict(RF, test, type = "prob")
rfmod_scored <- cbind(test, rfmod_pred, rfmod_prob)
#AUC/ROC
# "Yes" is the second level of BlueClass, so each yardstick call names it as
# the event with event_level = "second" (the supported replacement for the
# deprecated yardstick.event_first option). The truth column is taken from
# the scored data frame instead of an outside vector.

#area under the curve
rf_auc <- rfmod_scored %>%
  yardstick::roc_auc(truth = BlueClass, Yes, event_level = "second")
rf_auc
#ROC curve + plot
# Stored as ROC_curve5 / ROC_curve_plot5 so the logistic-regression objects
# in ROC_curve4 / ROC_curve_plot4 are not overwritten.
ROC_curve5 <- rfmod_scored %>%
  yardstick::roc_curve(truth = BlueClass, Yes, event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)

# The title names the random forest; it previously carried the LDA label by
# copy-paste.
ROC_curve_plot5 <- ROC_curve5 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue") +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle("RF ROC curve")
ggplotly(ROC_curve_plot5)
#set new threshold
# Re-label the saved cross-validation predictions with a 0.50 cutoff on the
# "Yes" probability.
rfmod_pred2 <- RF$pred %>%
  #the accuracy doesn't improve by reducing the threshold any further than .50, 99.7% best.
  mutate(prediction = ifelse(Yes > .50, "Yes", "No")) %>%
  mutate(prediction = factor(prediction, levels = c("No", "Yes")))

#new threshold matrix
confusionMatrix(rfmod_pred2$prediction, rfmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  9650   25
##        Yes   16  309
##                                           
##                Accuracy : 0.9959          
##                  95% CI : (0.9944, 0.9971)
##     No Information Rate : 0.9666          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9357          
##                                           
##  Mcnemar's Test P-Value : 0.2115          
##                                           
##             Sensitivity : 0.9251          
##             Specificity : 0.9983          
##          Pos Pred Value : 0.9508          
##          Neg Pred Value : 0.9974          
##              Prevalence : 0.0334          
##          Detection Rate : 0.0309          
##    Detection Prevalence : 0.0325          
##       Balanced Accuracy : 0.9617          
##                                           
##        'Positive' Class : Yes             
## 
SVM
#Choosing tuning parameters:
# Grid searches with tune() over svm fits for three kernels. The seed is set
# once, before the first search.

#Linear
set.seed(1)
tune.out.linear <- tune(
  svm,
  BlueClass ~ Red + Green + Blue,
  data = train,
  kernel = "linear",
  ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100))
)

#Radial
tune.out.radial <- tune(
  svm,
  BlueClass ~ Red + Green + Blue,
  data = train,
  kernel = "radial",
  ranges = list(
    cost = c(0.1, 1, 10, 100, 1000),
    gamma = c(0.5, 1, 2, 3, 4)
  )
)
#lowest error is radial cost 10 gamma 1

#Poly
tune.out.poly <- tune(
  svm,
  BlueClass ~ Red + Green + Blue,
  data = train,
  kernel = "polynomial",
  ranges = list(
    cost = c(0.1, 1, 10, 100, 1000),
    degree = c(1, 2, 3, 4, 5)
  )
)

##------

##------

system.time({
  # caret's "svmRadial" method is tuned through `sigma` and `C`. The e1071
  # names `cost` and `gamma` are not tuning parameters of this method, so
  # passing them as extra arguments left the model on caret's default grid.
  # The values chosen with tune() (cost = 10, gamma = 1) are therefore
  # supplied through tuneGrid, with gamma mapped to sigma (the RBF kernel
  # width) and cost mapped to C.
  svmmod <- train(
    BlueClass ~ Red + Green + Blue,
    data = train,
    method = "svmRadial",
    metric = "Accuracy",
    trControl = train_control,
    tuneGrid = expand.grid(sigma = 1, C = 10),
    preProcess = c("center", "scale")
  )
})
##    user  system elapsed 
##  23.185   0.115  37.997
#Set prediction, probability, and cv score variables in case needed
svmmod_pred <- predict(svmmod, test, type = "raw")
svmmod_prob <- predict(svmmod, test, type = "prob")
svmmod_scored <- cbind(test, svmmod_pred, svmmod_prob)
#AUC/ROC
# "Yes" is the second level of BlueClass, so each yardstick call names it as
# the event with event_level = "second" (the supported replacement for the
# deprecated yardstick.event_first option). The truth column is taken from
# the scored data frame instead of an outside vector.

#area under the curve
svmmod_auc <- svmmod_scored %>%
  yardstick::roc_auc(truth = BlueClass, Yes, event_level = "second")
svmmod_auc
#ROC curve + plot
# Stored as ROC_curve6 / ROC_curve_plot6 so ROC objects created for other
# models are not overwritten.
ROC_curve6 <- svmmod_scored %>%
  yardstick::roc_curve(truth = BlueClass, Yes, event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)

# The title names the SVM; it previously carried the LDA label by copy-paste.
ROC_curve_plot6 <- ROC_curve6 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "blue") +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle("SVM ROC curve")
ggplotly(ROC_curve_plot6)
#set new threshold
# Re-label the saved cross-validation predictions with a 0.52 cutoff on the
# "Yes" probability.
svmmod_pred2 <- svmmod$pred %>%
  #the accuracy doesn't improve by reducing the threshold any further than .52, 99.7% best.
  mutate(prediction = ifelse(Yes > .52, "Yes", "No")) %>%
  mutate(prediction = factor(prediction, levels = c("No", "Yes")))

#new threshold matrix
confusionMatrix(svmmod_pred2$prediction, svmmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  9655   25
##        Yes   11  309
##                                          
##                Accuracy : 0.9964         
##                  95% CI : (0.995, 0.9975)
##     No Information Rate : 0.9666         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.9431         
##                                          
##  Mcnemar's Test P-Value : 0.03026        
##                                          
##             Sensitivity : 0.9251         
##             Specificity : 0.9989         
##          Pos Pred Value : 0.9656         
##          Neg Pred Value : 0.9974         
##              Prevalence : 0.0334         
##          Detection Rate : 0.0309         
##    Detection Prevalence : 0.0320         
##       Balanced Accuracy : 0.9620         
##                                          
##        'Positive' Class : Yes            
## 
Tuning parameter selection process
    1. Interpretation of your chosen tuning parameter values:
  • Ntree = 500 indicates that 500 trees were created. I understand that it is important for this number to be substantial to reduce variance, but as we know, that can also lead to bias. There should be a sweet spot where every input row gets predicted at least a few times without overfitting the trees. Mtry=3 indicates that 3 variables were randomly sampled at each split. When mtry=p it essentially equates to bagging, whereas if it's set to 1 it essentially chooses a random variable. I understand it is good to try out a few values that range no smaller than 2 and no larger than p. The gamma=1 svm parameter indicates that a single training example has far-reaching influence. A cost = 10 here means we are "paying" a high price for higher accuracy.

    1. Explanation of how your chosen tuning parameter values were chosen:
  • I tried to make this selection process as programmatic as possible. There was a lot of learning and I'm sure there is still a healthy amount that should be corrected within the code but I was glad to find the resources that I did. For mtry I used the tuneRF() function that accepts an initial value for mtry and returns the out-of-bag error for your input value as well as a few surrounding values. I chose mtry because it produced the lowest oob error and went on to choose ntrees from there. At that point I included my mtry=3 into several RF models with ntrees of different values and again looked for the highest accuracy. Ntrees=500 was the winner in that sense, and it appears to be a very commonly used value for that parameter. For the SVM parameters I used our class lab as a guide to run the tune() function for linear, radial and polynomial kernels with several cost, gamma and degree values respectively. The tune function returns a best performer, which was the radial kernel with cost = 10 and gamma = 1. When running the train function the sigma = 8.691262 and C = 1 values were returned as contributing to the highest accuracy and therefore were the most optimal values for those parameters.

Conclusions
    1. A discussion of the best performing algorithm(s) in the cross-validation and hold-out data
  • The best performing algorithm in the cross-validation spectrum would be GLM or QDA. Both run in under 1.5 seconds for the user (up to ~15 seconds faster than some of the others) and still have an accuracy over 99.4%. The hold-out data generally ran very slowly, which could be due to the size of the dataset, but given the consistently high accuracy and ease of use of these two functions I would be even more likely to advocate for them in a hold-out setting.

    1. A discussion or analysis justifying why your findings above are compatible or reconcilable
  • Yes, of course compatible. Time is always a consideration but not quite as much of a factor in our cross validation. You can nit pick fractions of a second there but generally they run comparably fast. I realized how much that can vary when executing other sets of data, particularly when they become extremely large. 99%+ accuracy, depending on the target and industry, is generally good among the metrics we see. If I had to choose a “desert island” algorithm for training and testing, it would most likely be one of these for being good performers in many scenarios.

    1. A recommendation and rationale regarding which algorithm to use for detection of blue tarps
  • For the detection of blue tarps I’d recommend using the radial kernel svm function. It's valuable to tune your parameters according to your data and how you may define accuracy. Particularly with this pixel data, you could lower the threshold for determining a blue tarp knowing that it goes beyond the original tarp limits, but avoids the possibility of neglecting a human in need. The reason I became partial to these functions in the context of a natural disaster is that they can give you a starting point for additional tuning. Accuracy and automation are two things I would assume are extremely valuable in the wake of a natural disaster.

    1. A discussion of the relevance of the metrics calculated in the tables to this application context
  • Well, I think it's telling that I speak mostly about accuracy and time in the context of this project. I think that is a direct nod to the context of an emergency situation. In a situation where time is not largely a factor, or not a factor at all, perhaps I would be speaking much more often about sensitivity, and about using sensitivity and accuracy as the drivers for choosing an algorithm rather than the speed/accuracy trade-off that a threatening situation can require.

    1. Additional thoughts and questions: I’ve done some research to learn more about how RGB average thresholds are set in order to determine that it is a “green” vegetation area or “red” soil area. And honestly I’m having a hard time aligning the logic. I understand from a paper titled “Geospatial Disaster Response during the Haiti Earthquake: A Case Study Spanning Airborne Deployment, Data Collection, Transfer, Processing, and Dissemination” that a Mahalanobis distance classifier was trained using a collection of known blue tarp pixels, the RGB values of the pixels in the photos were averaged, and that’s how the other pixels are determined to be blue tarp or not to be blue tarp. My issue is largely that the average of those three values is not exclusive to the color. I read that “RGB (Red, Green, Blue) are 8 bit each. The range for each individual colour is 0-255 (as 2^8 = 256 possibilities). The combination range is 256 × 256 × 256. By dividing by 255, the 0-255 range can be described with a 0.0-1.0 range where 0.0 means 0 (0x00) and 1.0 means 255 (0xFF).” So how could the average of these three values indicate something is blue over it being red, when the average for something being ‘true’ red (255,0,0) would have the same average as something being ‘true’ green or blue at (0,255,0) or (0,0,255) respectively? It seems more logical for there to be a lower bound limit to what could be considered blue, in addition to it being at least 1.2x the values of the other two, for example. There are studies around accessibility and exactly what shade begins to be unseeable by someone who is color blind; how do they determine their thresholds? That will be my next rabbit hole.